TODO

library(ggplot2)
library(data.table)
library(dplyr)

Load all good candidates for all GAGA species (1st filter: Take only good candidates from both databases)

Load all tsv files for the different GAGA ids in a list of data frames.

#tsv2load<-"/Users/lukas/sciebo/Projects/LGT/results/GAGA.LGT/*/results/*.*.lgt.good.candidates.tsv"
tsv2load<-"/Users/Janina/sciebo/GAGA.LGT/*/results/*.*.lgt.good.candidates.tsv"
#tsv2load<-"/global/scratch2/j_rink02/master/lgt/0_data/*/results/*.*.lgt.good.candidates.tsv"
dataFiles <- lapply(Sys.glob(tsv2load), read.csv,sep="\t")

add the name

Add the GAGA id as a name to the different list elements. Add the database origin as a name to the different list elements. Set the GAGA-id and database origin separated with a “.” as a name for each element of the list “dataFiles”. Each element in the list is one genome, either from the noAnt or the eukaryotic database including the candidates.

ids<-gsub(".*/(.+-[0-9]+)\\..+.lgt.good.candidates.tsv","\\1",Sys.glob(tsv2load))
type<-gsub(".*/(.+-[0-9]+)\\.(.+).lgt.good.candidates.tsv","\\2",Sys.glob(tsv2load))
names(dataFiles)<-paste(ids,type,sep=".")

Combine all GAGA ids

Combine tsv files from all GAGA ids in the large list element (“dataFiles”) to one big data frame (“df”).


df<-rbindlist(dataFiles,idcol = T, fill=TRUE) #the fill=TRUE option fills missing columns 

The unfiltered df containes 13664 candidates from all 162 selected genomes together. Each row in the dataframe represents one candidate.

head(df)

Add scaffold length for every genome

  1. Add the scaffolds with their lengths for every genome from the file “genome.file”.
  2. Create a large dataframe with scaffold length information for every genome called “dscf”
  3. Merge the scaffold information together with the information for the LGTs
#data2load<-"/Users/lukas/sciebo/Projects/LGT/results/GAGA.LGT/*/results/genome.file"
data2load<-"/Users/Janina/sciebo/GAGA.LGT/*/results/genome.file"  # load data from all genome.files
#data2load<-"/global/scratch2/j_rink02/master/lgt/0_data/*/results/genome.file"  # load data from all genome.files
genomeFiles <- lapply(Sys.glob(data2load), read.csv,sep="\t",header=F) # create large list of all genome.files

# Create a large df with information on scaffold number and length for every genome
genome_ids<-gsub(".*\\/(.*)\\/results\\/genome.file","\\1",Sys.glob(data2load))
names(genomeFiles)<-paste(genome_ids)

dscf<-rbindlist(genomeFiles,idcol = T)

colnames(dscf)
[1] ".id" "V1"  "V2" 
names(dscf)[2]<-"cand.scaffold" #rename the second column of dscf to merge later by this column
names(dscf)[1]<-"GAGA_id"

df$GAGA_id<-gsub("\\..*","",df$.id) #separate the first row of df to merge by column "GAGA_id"


# Merge dscf together with the information for every LGT candidate from df
df<-left_join(df, dscf, by = c("GAGA_id","cand.scaffold"))
names(df)[55]<-"scaffold.length"
df$V2<- NULL
Column 'V2' does not exist to remove
head(df)

extract eval

  1. Extract the evalue of the best prokaryotic hit (take column “bestProHit”. In this column, the evalue is located
    between other values and will be extracted in the next command). Make a new data table (called “bestblasthits”), separating the original column bestProHit into different columns, with the each value being separated by “;”.

  2. Add a column called “besteval” to the df, taking only the eval (column V4) from the bestblasthits dataframe.

bestblasthits<-read.csv(text=as.character(df$bestProHit),sep = ";",as.is = T,fill = T,blank.lines.skip = F,header=F)
df$besteval<-bestblasthits$V4

add column with length of candidate

df$cand.length<-df$cand.end-df$cand.start

extract broad locus start and stop

To cluster candidates in one close region together and view them as one single LGT, take the broad locus.

  1. The broad locus (columns “scaffold”, start“,”end" or summarized together in column “locus” of df) clusters all candidates within +/- 20kb region together and assigns them the same start and stop codon. Thereby, single candidates can be grouped by the same start and stop codon in the broad locus region. Separate the column “locus” from the df into scaffold (V1), start (V2) and end (V3) in the new dataframe “loci”.

  2. Add a column called “locus.start”, “locus.end” and “locus.length” to the df, taking the now separated columns from the new dataframe “loci”.

loci<-read.csv(text=gsub(":","-",df$locus),sep = "-",as.is = T,fill = T,blank.lines.skip = F,header=F)
df$locus.start<-loci$V2
df$locus.end<-loci$V3
df$locus.length<-df$locus.end-df$locus.start

plot overview plots

plot histograms of different metrics to see how they are distributed.

# gc-content
ggplot(df, aes(x=gc)) +
  geom_histogram(fill="red", alpha=0.5, position="identity")+theme_classic()


# entropy (ce)
ggplot(df, aes(x=ce)) +
  geom_histogram(fill="red", alpha=0.5, position="identity")+theme_classic()


# ct4
ggplot(df, aes(x=ct4)) +
  geom_histogram(fill="red", alpha=0.5, position="identity")+theme_classic()


# ct6
ggplot(df, aes(x=ct6)) +
  geom_histogram(fill="red", alpha=0.5, position="identity")+theme_classic()


# length of broad locus
ggplot(df, aes(x=locus.length)) +
  geom_histogram(fill="red", alpha=0.5, position="identity")+theme_classic()


# length of candidate
ggplot(df, aes(x=cand.length)) +
  geom_histogram(fill="red", alpha=0.5, position="identity")+theme_classic()


# log of best e-value
df$log.besteval<- -log(df$besteval,10)
df$log.besteval[df$log.besteval==Inf]<-max(df$log.besteval[df$log.besteval!=Inf])+1
ggplot(df, aes(x=df$log.besteval)) +
  geom_histogram(fill="red", alpha=0.5, position="identity")+theme_classic()


# BitDiffSum
ggplot(df, aes(x=BitDiffSum)) +
  geom_histogram(fill="red", alpha=0.5, position="identity")+theme_classic()

Filter the candidate dataset to exclude false-positives and missassemblies

Filter by evalue (evalue > 1e-25), ct4 (ct4>0.25), ce (ce>1.5), BitDiffSum (BitDiffSum>150) and the length of the candidate (length>100)

dfFilter<-subset(df,log.besteval>25 & ct4>0.25 & ce>1.5 & BitDiffSum>150 & cand.end-cand.start>100)

Filtering by the above criteria reduces the number of candidates to 5355 candidates (originally: 13664) in all 162 genomes.

Remove anything at the beginning of a scaffold and candidates with a length over 20 kb. These are more likely to be misassemblies. (Note that this still leaves in candidates at the very end of scaffolds, which also are more likely misassemblies).

dfFilter2<-subset(dfFilter,locus.start > 1000 & cand.length < 20000)

Filtering by the above criteria reduces the number of candidates further to 3631 candidates in all 162 genomes.

Remove anything at the end of a scaffold.

# Filter the dataframe to remove candidates at the end of the scaffold       
dfFilter3<-subset(dfFilter2, cand.end != scaffold.length)

This further reduces the number of candidates to 3610 possible LGTs in all 162 genomes.

Summarize candidates in close proximity into one larger locus

One row per broad start/stop coordinate

required libraries

library(dplyr)
library(ggrepel)
  1. Create a data frame that has in each row one larger locus (often containing several lgt candidates).
# keep one row per larger locus and paste together all the info for the different LGT candidates contained in this locus

## https://stackoverflow.com/questions/40033625/concatenating-all-rows-within-a-group-using-dplyr/40033725
dfC <- dplyr::group_by(df, locus) %>%
        dplyr::summarise_each(funs(paste(unique(.), collapse = ";")))

# filter this by locus dataframe to only keep loci that have at least one good candidate (i.e. that was contained in the dfFilter3 dataframe)
dfC.filtered<-subset(dfC,locus %in% unique(dfFilter3$locus),select=c(locus,.id,cand.locus,cand.start,cand.end,bestProHit,BitDiffSum,scaffold,start,end,gc,gcs,ce))

# save data frame to file
#write.table(dfC.filtered,"/Users/lukas/sciebo/Projects/LGT/results/GAGA.LGT.filtered.tsv",sep="\t",quote = F,row.names = F)
write.table(dfC.filtered,"/global/scratch2/j_rink02/master/lgt/2_analysis/GAGA.LGT.162.filtered.tsv",sep="\t",quote = F,row.names = F)

# for plotting
# split the unfiltered large df dataframe by "locus" into a list of dataframes (one list element per locus)
dfSplit<-split(df,f=paste(df$.id,df$locus,sep="."))

# filter the list of dataframes to only retain those that contain a LGT from the dfFilter3 dataframe
dfSplit.filtered<-dfSplit[unique(paste(dfFilter3$.id,dfFilter3$locus,sep="."))]

Merging candidates in close proximity together and filtering by the above criteria to only obtain candidates which are also in the dfFilter3 dataframe reduces the number of candidates further to 928 candidates in all 162 genomes.

Note: The filtered.tsv file only contains candidates, which remain after filtering, but the simple plots created by the above commands contain all candidates in this region, even if they would have been filtered out before.

Plot each locus

# Define a function containing a ggplot command. This function will be applied to each element of dfSplit.filtered (the list of dataframes)
plotLGTlocus<-function(locusData){
    locusData$logeval<- -log(locusData$besteval,10)
    locusData$logeval[!is.finite(locusData$logeval)]<- 350
    locusData$species<-substr(gsub(".*;","",locusData$bestProHit),1,20)
    ggplotLGT<-ggplot(locusData)+
          geom_rect(aes(xmin=cand.start,xmax=cand.end,ymin=1,ymax=0,fill=logeval),size=0)+
          coord_cartesian(xlim=c(min(locusData$locus.start),max(locusData$locus.end)),ylim=c(0,5))+
          geom_text_repel(
            aes(x=cand.start,y=1,label=species),
            force_pull   = 0, # do not pull toward data points
            nudge_y      = 0.5,
            direction    = "x",
            angle        = 90,
            hjust        = 0,
            segment.size = 0.2,
            max.iter = 1e4, max.time = 1
            )+
          scale_fill_gradient(low="steelblue",high = "red",limits = c(20,350),na.value = "grey90")+
          ggtitle(locusData$.id[1])+
          theme_classic()+
          xlab(locusData$locus[1])+
          guides(y="none")+
          ylab("")+
          theme(legend.position="right")
    return(ggplotLGT)
  }
# test the plotting function
plotLGTlocus(dfSplit.filtered[[180]])
# run plotting function over all elements in the dfSplit.filtered list, i.e. over all loci.
list.of.plots<-lapply(dfSplit.filtered,FUN=plotLGTlocus)

# save all plots (adjust path to your system)
#lapply(1:length(list.of.plots), function(i){
      #ggsave(filename=paste0("/Users/lukas/sciebo/Projects/LGT/results/LGT.filtered/",gsub(":","-",names(list.of.plots)[i]),".pdf"), plot=list.of.plots[[i]])
#  })

lapply(1:length(list.of.plots), function(i){
      ggsave(filename=paste0("/global/scratch2/j_rink02/master/lgt/2_analysis/LGT.filtered/",gsub(":","-",names(list.of.plots)[i]),".pdf"), plot=list.of.plots[[i]])
  })

Plot the LGT filters

# Assign names to x
x <- c( "original", "1st_filtering_step", "2nd_filtering_step", "3rd_filtering_step", "merging_of_loci","manual_investigation")
# Assign names to y
y <- c( 13664, 5355, 3631, 3610, 928, 410)
# Assign x to "Filtering_step" as column name
# Assign y to "nr_candidates" as column name
filtering_df <- data.frame( "Filtering_step" = x, "Nr_candidates" = y)
# Print the data frame
filtering_df
library(forcats)

p0 <- ggplot(filtering_df, aes(x=reorder(Filtering_step, -Nr_candidates), y=Nr_candidates, fill=Nr_candidates)) +
  geom_bar(stat="identity")+theme_classic()+
  coord_flip()
p0


# flipped version
p1 <- ggplot(filtering_df, aes(x = reorder(Filtering_step, Nr_candidates), y = Nr_candidates)) +
  geom_bar(aes(fill = as.factor(Nr_candidates)), stat="identity", width = 0.4) + theme_classic() +
  geom_text(aes(label=Nr_candidates), hjust=1.3, color = "white", position=position_dodge(width=1.0), size = 3) +
  scale_fill_gradient(low="darkgreen", high="darkblue") +
  theme(legend.position = "none") +
  coord_flip() +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_fill_viridis_d(breaks = rev, direction = -1)+
  scale_y_continuous(position = "right", breaks = seq(0,14000, 2000)) +
  labs(title = "Filtering of predicted LGT candidates", x = NULL, y="Number of candidates")
Scale for 'fill' is already present. Adding another scale for 'fill', which will replace the existing scale.
p1 


setwd("/Users/Janina/Documents/MASTER/Masterarbeit/plots/self_produced")
The working directory was changed to /Users/Janina/Documents/MASTER/Masterarbeit/plots/self_produced inside a notebook chunk. The working directory will be reset when the chunk is finished running. Use the knitr root.dir option in the setup chunk to change the working directory for notebook chunks.
library(export)
graph2ppt(file="LGT_filtering")
Exported graph as LGT_filtering.pptx
ggsave("LGT_filtering.png", path = "/Users/Janina/Documents/MASTER/Masterarbeit/plots/self_produced")
Saving 7 x 7 in image

Last changed 2021/07/19

---
title: "R Notebook: Analyse LGT candidates"
output: html_notebook
---


## TODO
- group by start/end - done -
- plot by large candidate -done -
- include noAnt - done -
- set thresholds for evaluation of candidates - done -

```{r}
library(ggplot2)
library(data.table)
library(dplyr)
```

# Load all good candidates for all GAGA species (1st filter: Take only good candidates from both databases)

Load all tsv files for the different GAGA ids in a list of data frames.
```{r}
#tsv2load<-"/Users/lukas/sciebo/Projects/LGT/results/GAGA.LGT/*/results/*.*.lgt.good.candidates.tsv"
tsv2load<-"/Users/Janina/sciebo/GAGA.LGT/*/results/*.*.lgt.good.candidates.tsv"
#tsv2load<-"/global/scratch2/j_rink02/master/lgt/0_data/*/results/*.*.lgt.good.candidates.tsv"
dataFiles <- lapply(Sys.glob(tsv2load), read.csv,sep="\t")
```

# add the name
Add the GAGA id as a name to the different list elements.
Add the database origin as a name to the different list elements.
Set the GAGA-id and database origin separated with a "." as a name for each element of the list "dataFiles".
Each element in the list is one genome, either from the noAnt or the eukaryotic database including the candidates.
```{r}
ids<-gsub(".*/(.+-[0-9]+)\\..+.lgt.good.candidates.tsv","\\1",Sys.glob(tsv2load))
type<-gsub(".*/(.+-[0-9]+)\\.(.+).lgt.good.candidates.tsv","\\2",Sys.glob(tsv2load))
names(dataFiles)<-paste(ids,type,sep=".")
```


# Combine all GAGA ids
Combine tsv files from all GAGA ids in the large list element ("dataFiles") to one big data frame ("df").
```{r}

df<-rbindlist(dataFiles,idcol = T, fill=TRUE) #the fill=TRUE option fills missing columns 
```

The unfiltered df containes 13664 candidates from all 162 selected genomes together.
Each row in the dataframe represents one candidate.
```{r}
head(df)
```


# Add scaffold length for every genome
1. Add the scaffolds with their lengths for every genome from the file "genome.file".
2. Create a large dataframe with scaffold length information for every genome called "dscf"
3. Merge the scaffold information together with the information for the LGTs
```{r}
#data2load<-"/Users/lukas/sciebo/Projects/LGT/results/GAGA.LGT/*/results/genome.file"
data2load<-"/Users/Janina/sciebo/GAGA.LGT/*/results/genome.file"  # load data from all genome.files
#data2load<-"/global/scratch2/j_rink02/master/lgt/0_data/*/results/genome.file"  # load data from all genome.files
genomeFiles <- lapply(Sys.glob(data2load), read.csv,sep="\t",header=F) # create large list of all genome.files

# Create a large df with information on scaffold number and length for every genome
genome_ids<-gsub(".*\\/(.*)\\/results\\/genome.file","\\1",Sys.glob(data2load))
names(genomeFiles)<-paste(genome_ids)

dscf<-rbindlist(genomeFiles,idcol = T)

colnames(dscf)
names(dscf)[2]<-"cand.scaffold" #rename the second column of dscf to merge later by this column
names(dscf)[1]<-"GAGA_id"

df$GAGA_id<-gsub("\\..*","",df$.id) #separate the first row of df to merge by column "GAGA_id"


# Merge dscf together with the information for every LGT candidate from df
df<-left_join(df, dscf, by = c("GAGA_id","cand.scaffold"))
names(df)[55]<-"scaffold.length"
df$V2<- NULL
head(df)
```

# extract eval
1. Extract the evalue of the best prokaryotic hit (take column "bestProHit". In this column, the evalue is located  
   between other values and will be extracted in the next command).
   Make a new data table (called "bestblasthits"), separating the original column bestProHit into different columns,    with the each value being separated by ";".

2. Add a column called "besteval" to the df, taking only the eval (column V4) from the bestblasthits dataframe.

```{r}
bestblasthits<-read.csv(text=as.character(df$bestProHit),sep = ";",as.is = T,fill = T,blank.lines.skip = F,header=F)
df$besteval<-bestblasthits$V4
```

# add column with length of candidate

```{r}
df$cand.length<-df$cand.end-df$cand.start
```
# extract broad locus start and stop
To cluster candidates in one close region together and view them as one single LGT, take the broad locus.

1. The broad locus (columns "scaffold", start","end" or summarized together in column "locus" of df) clusters all
   candidates within +/- 20kb region together and assigns them the same start and stop codon. Thereby, single
   candidates can be grouped by the same start and stop codon in the broad locus region.
   Separate the column "locus" from the df into scaffold (V1), start (V2) and end (V3) in the new dataframe "loci".

2. Add a column called "locus.start", "locus.end" and "locus.length" to the df, taking the now separated columns
   from the new dataframe "loci".
```{r}
loci<-read.csv(text=gsub(":","-",df$locus),sep = "-",as.is = T,fill = T,blank.lines.skip = F,header=F)
df$locus.start<-loci$V2
df$locus.end<-loci$V3
df$locus.length<-df$locus.end-df$locus.start
```

# plot overview plots
plot histograms of different metrics to see how they are distributed.
```{r}
# gc-content
ggplot(df, aes(x=gc)) +
  geom_histogram(fill="red", alpha=0.5, position="identity")+theme_classic()

# entropy (ce)
ggplot(df, aes(x=ce)) +
  geom_histogram(fill="red", alpha=0.5, position="identity")+theme_classic()

# ct4
ggplot(df, aes(x=ct4)) +
  geom_histogram(fill="red", alpha=0.5, position="identity")+theme_classic()

# ct6
ggplot(df, aes(x=ct6)) +
  geom_histogram(fill="red", alpha=0.5, position="identity")+theme_classic()

# length of broad locus
ggplot(df, aes(x=locus.length)) +
  geom_histogram(fill="red", alpha=0.5, position="identity")+theme_classic()

# length of candidate
ggplot(df, aes(x=cand.length)) +
  geom_histogram(fill="red", alpha=0.5, position="identity")+theme_classic()

# log of best e-value
df$log.besteval<- -log(df$besteval,10)
df$log.besteval[df$log.besteval==Inf]<-max(df$log.besteval[df$log.besteval!=Inf])+1
ggplot(df, aes(x=df$log.besteval)) +
  geom_histogram(fill="red", alpha=0.5, position="identity")+theme_classic()

# BitDiffSum
ggplot(df, aes(x=BitDiffSum)) +
  geom_histogram(fill="red", alpha=0.5, position="identity")+theme_classic()
```


# Filter the candidate dataset to exclude false-positives and missassemblies

Filter by evalue (evalue > 1e-25), ct4 (ct4>0.25), ce (ce>1.5), BitDiffSum (BitDiffSum>150) and the length of the candidate (length>100)

```{r}
dfFilter<-subset(df,log.besteval>25 & ct4>0.25 & ce>1.5 & BitDiffSum>150 & cand.end-cand.start>100)
```
Filtering by the above criteria reduces the number of candidates to 5355 candidates (originally: 13664) in all 162 genomes.


Remove anything at the beginning of a scaffold and candidates with a length over 20 kb.
These are more likely to be misassemblies. (Note that this still leaves in candidates at the very end of scaffolds, which also are more likely misassemblies).
```{r}
dfFilter2<-subset(dfFilter,locus.start > 1000 & cand.length < 20000)
```
Filtering by the above criteria reduces the number of candidates further to 3631 candidates in all 162 genomes.

Remove anything at the end of a scaffold.
```{r}
# Filter the dataframe to remove candidates at the end of the scaffold       
dfFilter3<-subset(dfFilter2, cand.end != scaffold.length)
```
This further reduces the number of candidates to 3610 possible LGTs in all 162 genomes.


# Summarize candidates in close proximity into one larger locus
One row per broad start/stop coordinate

## required libraries
```{r}
library(dplyr)
library(ggrepel)
```

1. Create a data frame that has in each row one larger locus (often containing several lgt candidates).
```{r}
# keep one row per larger locus and paste together all the info for the different LGT candidates contained in this locus

## https://stackoverflow.com/questions/40033625/concatenating-all-rows-within-a-group-using-dplyr/40033725
dfC <- dplyr::group_by(df, locus) %>%
        dplyr::summarise_each(funs(paste(unique(.), collapse = ";")))

# filter this by locus dataframe to only keep loci that have at least one good candidate (i.e. that was contained in the dfFilter3 dataframe)
dfC.filtered<-subset(dfC,locus %in% unique(dfFilter3$locus),select=c(locus,.id,cand.locus,cand.start,cand.end,bestProHit,BitDiffSum,scaffold,start,end,gc,gcs,ce))

# save data frame to file
#write.table(dfC.filtered,"/Users/lukas/sciebo/Projects/LGT/results/GAGA.LGT.filtered.tsv",sep="\t",quote = F,row.names = F)
write.table(dfC.filtered,"/global/scratch2/j_rink02/master/lgt/2_analysis/GAGA.LGT.162.filtered.tsv",sep="\t",quote = F,row.names = F)

# for plotting
# split the unfiltered large df dataframe by "locus" into a list of dataframes (one list element per locus)
dfSplit<-split(df,f=paste(df$.id,df$locus,sep="."))

# filter the list of dataframes to only retain those that contain a LGT from the dfFilter3 dataframe
dfSplit.filtered<-dfSplit[unique(paste(dfFilter3$.id,dfFilter3$locus,sep="."))]


```
Merging candidates in close proximity together and filtering by the above criteria to only obtain candidates which are also in the dfFilter3 dataframe reduces the number of candidates further to 928 candidates in all 162 genomes.

Note: The filtered.tsv file only contains candidates, which remain after filtering, but the simple plots created by the above commands contain all candidates in this region, even if they would have been filtered out before.

# Plot each locus

```{r}
# Define a function containing a ggplot command. This function will be applied to each element of dfSplit.filtered (the list of dataframes)
plotLGTlocus<-function(locusData){
    locusData$logeval<- -log(locusData$besteval,10)
    locusData$logeval[!is.finite(locusData$logeval)]<- 350
    locusData$species<-substr(gsub(".*;","",locusData$bestProHit),1,20)
    ggplotLGT<-ggplot(locusData)+
          geom_rect(aes(xmin=cand.start,xmax=cand.end,ymin=1,ymax=0,fill=logeval),size=0)+
          coord_cartesian(xlim=c(min(locusData$locus.start),max(locusData$locus.end)),ylim=c(0,5))+
          geom_text_repel(
            aes(x=cand.start,y=1,label=species),
            force_pull   = 0, # do not pull toward data points
            nudge_y      = 0.5,
            direction    = "x",
            angle        = 90,
            hjust        = 0,
            segment.size = 0.2,
            max.iter = 1e4, max.time = 1
            )+
          scale_fill_gradient(low="steelblue",high = "red",limits = c(20,350),na.value = "grey90")+
          ggtitle(locusData$.id[1])+
          theme_classic()+
          xlab(locusData$locus[1])+
          guides(y="none")+
          ylab("")+
          theme(legend.position="right")
    return(ggplotLGT)
  }
```

```{r}
# test the plotting function
plotLGTlocus(dfSplit.filtered[[180]])
```


```{r}
# run plotting function over all elements in the dfSplit.filtered list, i.e. over all loci.
list.of.plots<-lapply(dfSplit.filtered,FUN=plotLGTlocus)

# save all plots (adjust path to your system)
#lapply(1:length(list.of.plots), function(i){
      #ggsave(filename=paste0("/Users/lukas/sciebo/Projects/LGT/results/LGT.filtered/",gsub(":","-",names(list.of.plots)[i]),".pdf"), plot=list.of.plots[[i]])
#  })

lapply(1:length(list.of.plots), function(i){
      ggsave(filename=paste0("/global/scratch2/j_rink02/master/lgt/2_analysis/LGT.filtered/",gsub(":","-",names(list.of.plots)[i]),".pdf"), plot=list.of.plots[[i]])
  })
```



Plot the LGT filters 
```{r}
# Assign names to x
x <- c( "original", "1st_filtering_step", "2nd_filtering_step", "3rd_filtering_step", "merging_of_loci","manual_investigation")
# Assign names to y
y <- c( 13664, 5355, 3631, 3610, 928, 410)
# Assign x to "Filtering_step" as column name
# Assign y to "nr_candidates" as column name
filtering_df <- data.frame( "Filtering_step" = x, "Nr_candidates" = y)
# Print the data frame
filtering_df
```


```{r}
library(forcats)

p0 <- ggplot(filtering_df, aes(x=reorder(Filtering_step, -Nr_candidates), y=Nr_candidates, fill=Nr_candidates)) +
  geom_bar(stat="identity")+theme_classic()+
  coord_flip()
p0

# flipped version
p1 <- ggplot(filtering_df, aes(x = reorder(Filtering_step, Nr_candidates), y = Nr_candidates)) +
  geom_bar(aes(fill = as.factor(Nr_candidates)), stat="identity", width = 0.4) + theme_classic() +
  geom_text(aes(label=Nr_candidates), hjust=1.3, color = "white", position=position_dodge(width=1.0), size = 3) +
  scale_fill_gradient(low="darkgreen", high="darkblue") +
  theme(legend.position = "none") +
  coord_flip() +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_fill_viridis_d(breaks = rev, direction = -1)+
  scale_y_continuous(position = "right", breaks = seq(0,14000, 2000)) +
  labs(title = "Filtering of predicted LGT candidates", x = NULL, y="Number of candidates")
p1 

setwd("/Users/Janina/Documents/MASTER/Masterarbeit/plots/self_produced")
library(export)
graph2ppt(file="LGT_filtering")
```
```{r}

ggsave("LGT_filtering.png", path = "/Users/Janina/Documents/MASTER/Masterarbeit/plots/self_produced")
```
Last changed 2021/07/19
